{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Propagate fix to copies columns\n",
    "\n",
    "Having improved the measure of copies in titlemeta.tsv, we now need to propagate that improvement outward to the subset files: weighted_subset and manual_title_subset, and also gender_balanced_subset and most_popular_subset.\n",
    "\n",
    "Unfortunately, it's too late to reselect weighted_subset using the improved measure. We'll just have to live with the slight imperfection, which mostly affects the nineteenth century."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build a lookup from the first column of filename_translator.tsv to the\n",
    "# second (presumably bad docid -> corrected docid; confirm against the file).\n",
    "translator = dict()\n",
    "with open('../data/filename_translator.tsv', encoding = 'utf-8') as f:\n",
    "    for line in f:\n",
    "        fields = line.strip().split('\\t')\n",
    "        # Skip the 'badname' row (apparently the header), plus blank or\n",
    "        # one-column lines that would otherwise raise IndexError below.\n",
    "        if len(fields) < 2 or fields[0] == 'badname':\n",
    "            continue\n",
    "        translator[fields[0]] = fields[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load masterficmetadata.tsv (tab-separated) into `volume`.\n",
    "volume = pd.read_csv('../masterficmetadata.tsv', sep='\\t', low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fixdocids(df):\n",
    "    \"\"\"Normalize the 'docid' column of df in place.\n",
    "\n",
    "    A docid that is a key of the global `translator` dict is replaced by\n",
    "    its mapped value. Any other string docid containing ':/' has every\n",
    "    ':' replaced by '+' and every '/' replaced by '='. Non-string docids\n",
    "    (e.g. NaN from an empty field) are left untouched instead of raising\n",
    "    TypeError.\n",
    "\n",
    "    Prints the number of translated and of cleaned docids; returns None.\n",
    "    \"\"\"\n",
    "    docids = df['docid']\n",
    "\n",
    "    # Plain boolean arrays, so the writes below are positional and do not\n",
    "    # rely on the index labels being unique.\n",
    "    translated = docids.isin(list(translator)).values\n",
    "    has_path_chars = docids.map(lambda d: isinstance(d, str) and ':/' in d)\n",
    "    # A translator match takes precedence over the ':/' cleanup.\n",
    "    cleanable = has_path_chars.astype(bool).values & ~translated\n",
    "\n",
    "    if translated.any():\n",
    "        df.loc[translated, 'docid'] = docids[translated].map(translator).values\n",
    "    if cleanable.any():\n",
    "        df.loc[cleanable, 'docid'] = docids[cleanable].map(\n",
    "            lambda d: d.replace(':', '+').replace('/', '=')).values\n",
    "\n",
    "    print('Translated: ', int(translated.sum()))\n",
    "    print('Cleaned: ', int(cleanable.sum()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Translated:  1180\n",
      "Cleaned:  658\n"
     ]
    }
   ],
   "source": [
    "fixdocids(volume)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load recordmeta.tsv (tab-separated) into `record`.\n",
    "record = pd.read_csv('../recordmeta.tsv', sep='\\t', low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Translated:  1180\n",
      "Cleaned:  0\n"
     ]
    }
   ],
   "source": [
    "fixdocids(record)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load titlemeta.tsv, the source of the corrected copy counts.\n",
    "title = pd.read_csv('../titlemeta.tsv', sep='\\t', low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Translated:  940\n",
      "Cleaned:  388\n"
     ]
    }
   ],
   "source": [
    "fixdocids(title)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the weighted subset whose copy counts will be refreshed.\n",
    "weighted = pd.read_csv('../manuallists/weighted_subset.tsv', sep='\\t', low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Translated:  33\n",
      "Cleaned:  0\n"
     ]
    }
   ],
   "source": [
    "fixdocids(weighted)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the manual title subset whose copy counts will be refreshed.\n",
    "manual = pd.read_csv('../manuallists/manual_title_subset.tsv', sep='\\t', low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Translated:  28\n",
      "Cleaned:  0\n"
     ]
    }
   ],
   "source": [
    "fixdocids(manual)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Index by docid so rows can be matched against `title` by label.\n",
    "weighted = weighted.set_index('docid')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Index both tables by docid for label-based lookups.\n",
    "title = title.set_index('docid')\n",
    "manual = manual.set_index('docid')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copy the corrected copy counts from `title` into `weighted`, one docid\n",
    "# at a time; `err` counts docids that have no row in `title`.\n",
    "err = 0\n",
    "for docid in weighted.index:\n",
    "    if docid not in title.index:\n",
    "        print('error', docid)\n",
    "        err += 1\n",
    "        continue\n",
    "    all_copies = title.loc[docid, 'allcopiesofwork']\n",
    "    copies_25yrs = title.loc[docid, 'copiesin25yrs']\n",
    "    weighted.loc[docid, 'allcopiesofwork'] = all_copies\n",
    "    weighted.loc[docid, 'copiesin25yrs'] = copies_25yrs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "err"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Copy the corrected copy counts from `title` into `manual`, one docid\n",
    "# at a time; `err` counts docids that have no row in `title`.\n",
    "err = 0\n",
    "for docid in manual.index:\n",
    "    if docid not in title.index:\n",
    "        print('error', docid)\n",
    "        err += 1\n",
    "        continue\n",
    "    all_copies = title.loc[docid, 'allcopiesofwork']\n",
    "    copies_25yrs = title.loc[docid, 'copiesin25yrs']\n",
    "    manual.loc[docid, 'allcopiesofwork'] = all_copies\n",
    "    manual.loc[docid, 'copiesin25yrs'] = copies_25yrs\n",
    "err"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Written under a new name, so the original manual_title_subset.tsv is kept.\n",
    "manual.to_csv('../manuallists/new_manual_title_subset.tsv', sep='\\t', index_label='docid')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Overwrites the file that `weighted` was loaded from.\n",
    "weighted.to_csv('../manuallists/weighted_subset.tsv', sep='\\t', index_label='docid')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Overwrites titlemeta.tsv with the normalized docids.\n",
    "title.to_csv('../titlemeta.tsv', sep='\\t', index_label='docid')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Index by docid so the file is written with docid as its first column.\n",
    "volume = volume.set_index('docid')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Overwrites masterficmetadata.tsv with the normalized docids.\n",
    "volume.to_csv('../masterficmetadata.tsv', sep='\\t', index_label='docid')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Index by docid, then overwrite recordmeta.tsv with the normalized docids.\n",
    "record = record.set_index('docid')\n",
    "record.to_csv('../recordmeta.tsv', sep='\\t', index_label='docid')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Translated:  14\n",
      "Cleaned:  0\n"
     ]
    }
   ],
   "source": [
    "# Load the gender-balanced subset and normalize its docids.\n",
    "gender = pd.read_csv('../manuallists/gender_balanced_subset.tsv', sep='\\t', low_memory=False)\n",
    "fixdocids(gender)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Index by docid so rows can be matched against `title` by label.\n",
    "gender = gender.set_index('docid')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Copy the corrected copy counts from `title` into `gender`, one docid\n",
    "# at a time; `err` counts docids that have no row in `title`.\n",
    "err = 0\n",
    "for docid in gender.index:\n",
    "    if docid not in title.index:\n",
    "        print('error', docid)\n",
    "        err += 1\n",
    "        continue\n",
    "    all_copies = title.loc[docid, 'allcopiesofwork']\n",
    "    copies_25yrs = title.loc[docid, 'copiesin25yrs']\n",
    "    gender.loc[docid, 'allcopiesofwork'] = all_copies\n",
    "    gender.loc[docid, 'copiesin25yrs'] = copies_25yrs\n",
    "err"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Translated:  31\n",
      "Cleaned:  3\n"
     ]
    }
   ],
   "source": [
    "# Load the most-popular subset and normalize its docids.\n",
    "popular = pd.read_csv('../reportcode/most_popular_subset.tsv', sep='\\t', low_memory=False)\n",
    "fixdocids(popular)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Index by docid so rows can be matched against `title` by label.\n",
    "popular = popular.set_index('docid')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Copy the corrected copy counts from `title` into `popular`, one docid\n",
    "# at a time; `err` counts docids that have no row in `title`.\n",
    "err = 0\n",
    "for docid in popular.index:\n",
    "    if docid not in title.index:\n",
    "        print('error', docid)\n",
    "        err += 1\n",
    "        continue\n",
    "    all_copies = title.loc[docid, 'allcopiesofwork']\n",
    "    copies_25yrs = title.loc[docid, 'copiesin25yrs']\n",
    "    popular.loc[docid, 'allcopiesofwork'] = all_copies\n",
    "    popular.loc[docid, 'copiesin25yrs'] = copies_25yrs\n",
    "err"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Overwrites the file that `gender` was loaded from.\n",
    "gender.to_csv('../manuallists/gender_balanced_subset.tsv', sep='\\t', index_label='docid')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Saved under a different name and directory than the file it was read from.\n",
    "popular.to_csv('../manuallists/frequently_reprinted_subset.tsv', sep='\\t', index_label='docid')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
